In [1]:
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
import statsmodels
In [2]:
# Load the pickled book collection stored under the 'aBookCollection' key.
# NOTE(review): shelve is pickle-backed, so reading a value can execute code
# embedded in the file -- only open "storage_new.db" from a trusted source.
myShelf = shelve.open("storage_new.db")
try:
    aBookCollection = myShelf['aBookCollection']
finally:
    # Close the underlying database file deterministically; a bare `del` only
    # drops the name and leaves the close to garbage collection.
    myShelf.close()
del myShelf
In [3]:
# Build the working sample one step at a time.
# Method names suggest: keep authors with at least 10 books, pick 5 of them,
# then split off 10 books per author -- confirm against book_classification.
# NOTE(review): sample_authors presumably draws at random and no seed is set
# here, so a re-run may select different authors -- confirm.
prolificAuthors = aBookCollection.exclude_authors_with_less_than(10)
sampledAuthors = prolificAuthors.sample_authors(5)
someBooks, _ = sampledAuthors.split_at_number_per_author(10)  # second part is discarded

aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_book_collection(
    someBooks
)
In [4]:
# Frequency table for the whole sample. dataframe_total() presumably returns a
# pandas DataFrame with a 'Value' column (later cells use it that way).
sampleFrequencies = aPossibleFeatureAnalyzer.frequencies()
freqDf = sampleFrequencies.dataframe_total()
In [5]:
# Density estimate of the frequencies on a log10 scale.
logFreqDf = freqDf.apply(numpy.log10)
logFreqDf.plot(kind='kde')
Out[5]:
In [6]:
import scipy.stats
import statsmodels.graphics.gofplots as gp

# Q-Q plot of the log10 frequencies against a default (frozen) uniform
# distribution; the returned figure is bound to `_` to suppress cell output.
uniformDist = scipy.stats.distributions.uniform()
logFreqValues = freqDf.Value.apply(numpy.log10)
_ = gp.qqplot(logFreqValues, uniformDist)
In [7]:
# Entropy table for the whole sample, followed by its density estimate.
sampleEntropies = aPossibleFeatureAnalyzer.entropies()
entrDf = sampleEntropies.dataframe_total()
entrDf.plot(kind='kde')
Out[7]:
In [8]:
# Q-Q plot of the entropy values against a default (frozen) uniform
# distribution; the returned figure is bound to `_` to suppress cell output.
entropyValues = entrDf.Value
uniformForEntropy = scipy.stats.distributions.uniform()
_ = gp.qqplot(entropyValues, uniformForEntropy)
In [9]:
# Arguments for prune_frequencies_quantiles. (0, 1) presumably spans the full
# quantile range, i.e. prunes nothing -- confirm. (0.35, 1) was tried earlier.
pruneQuantiles = (0, 1)
blah = aPossibleFeatureAnalyzer.prune_frequencies_quantiles(*pruneQuantiles)
In [10]:
# Frequency table of the analyzer returned by prune_frequencies_quantiles.
prunedFrequencies = blah.frequencies()
freqDf2 = prunedFrequencies.dataframe_total()
In [11]:
# Density estimate of the pruned frequencies on a log10 scale.
logFreqDf2 = freqDf2.apply(numpy.log10)
logFreqDf2.plot(kind='kde')
Out[11]:
In [12]:
# Q-Q plot of the pruned log10 frequencies against a default (frozen) uniform
# distribution; the returned figure is bound to `_` to suppress cell output.
logFreqValues2 = freqDf2.Value.apply(numpy.log10)
uniformForPruned = scipy.stats.distributions.uniform()
_ = gp.qqplot(logFreqValues2, uniformForPruned)
In [13]:
# Entropy table of the analyzer returned by prune_frequencies_quantiles.
prunedEntropies = blah.entropies()
entrDf2 = prunedEntropies.dataframe_total()
In [14]:
# Density estimate of the entropies after pruning.
kdePlotOptions = {'kind': 'kde'}
entrDf2.plot(**kdePlotOptions)
Out[14]:
In [15]:
# Show the ten rows with the smallest 'Value'.
# DataFrame.sort() was removed in pandas 0.20; sort_values() is the supported
# replacement and, like the old call, sorts ascending by default.
entrDf2.sort_values('Value').head(10)
Out[15]:
In [16]:
# matplotlib.pyplot has no figsize() function (figsize was an IPython pylab
# helper), so set the default figure size through rcParams instead.
plt.rcParams['figure.figsize'] = (10, 6)

# Histograms of the per-author entropy table (presumably a pandas DataFrame,
# in which case hist() draws one histogram per column -- confirm).
entrPnl2 = blah.entropies().dataframe_authors()
entrPnl2.hist()
Out[16]:
In [17]:
# Histograms of the per-author frequency table, drawn with log=True.
authorFrequencies = blah.frequencies()
freqPnl2 = authorFrequencies.dataframe_authors()
freqPnl2.hist(log=True)
Out[17]:
In [18]:
# Sample each column of the per-author entropy table at 50 evenly spaced
# quantile levels (0.00, 0.02, ..., 0.98), producing one curve per column.
df = blah.entropies().dataframe_authors()

# Float division keeps the levels fractional under Python 2 as well, where
# v/50 would truncate to 0 for every v.
quantileLevels = [v / 50.0 for v in range(50)]

mydata = []
for col in df:
    # Drop missing entries once per column instead of once per quantile.
    columnValues = df[col].dropna()
    mydata.append([columnValues.quantile(q) for q in quantileLevels])
In [19]:
import statsmodels.api as sm

# Functional boxplot of the quantile curves collected in mydata; the call is
# left as the last expression so its return value is shown as cell output.
functionalBoxplot = sm.graphics.fboxplot
functionalBoxplot(mydata)
Out[19]:
In [ ]: